{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 读取数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>age</th>\n",
       "      <th>sex</th>\n",
       "      <th>bmi</th>\n",
       "      <th>children</th>\n",
       "      <th>smoker</th>\n",
       "      <th>region</th>\n",
       "      <th>charges</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>19</td>\n",
       "      <td>female</td>\n",
       "      <td>27.900</td>\n",
       "      <td>0</td>\n",
       "      <td>yes</td>\n",
       "      <td>southwest</td>\n",
       "      <td>16884.92400</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>18</td>\n",
       "      <td>male</td>\n",
       "      <td>33.770</td>\n",
       "      <td>1</td>\n",
       "      <td>no</td>\n",
       "      <td>southeast</td>\n",
       "      <td>1725.55230</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>28</td>\n",
       "      <td>male</td>\n",
       "      <td>33.000</td>\n",
       "      <td>3</td>\n",
       "      <td>no</td>\n",
       "      <td>southeast</td>\n",
       "      <td>4449.46200</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>33</td>\n",
       "      <td>male</td>\n",
       "      <td>22.705</td>\n",
       "      <td>0</td>\n",
       "      <td>no</td>\n",
       "      <td>northwest</td>\n",
       "      <td>21984.47061</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>32</td>\n",
       "      <td>male</td>\n",
       "      <td>28.880</td>\n",
       "      <td>0</td>\n",
       "      <td>no</td>\n",
       "      <td>northwest</td>\n",
       "      <td>3866.85520</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>31</td>\n",
       "      <td>female</td>\n",
       "      <td>25.740</td>\n",
       "      <td>0</td>\n",
       "      <td>no</td>\n",
       "      <td>southeast</td>\n",
       "      <td>3756.62160</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   age     sex     bmi  children smoker     region      charges\n",
       "0   19  female  27.900         0    yes  southwest  16884.92400\n",
       "1   18    male  33.770         1     no  southeast   1725.55230\n",
       "2   28    male  33.000         3     no  southeast   4449.46200\n",
       "3   33    male  22.705         0     no  northwest  21984.47061\n",
       "4   32    male  28.880         0     no  northwest   3866.85520\n",
       "5   31  female  25.740         0     no  southeast   3756.62160"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "# Load the insurance records from the CSV file (comma is read_csv's default separator)\n",
    "# and preview the first six rows.\n",
    "data = pd.read_csv('./data/insurance.csv')\n",
    "data.head(6)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# EDA 数据探索"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "\n",
    "# Render figures inline in the notebook. The magic name must follow '%' directly;\n",
    "# the spaced form '% matplotlib' is not accepted by newer IPython releases.\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(array([536., 398., 129.,  86.,  35.,  59.,  57.,  32.,   2.,   4.]),\n",
       " array([ 1121.8739  ,  7386.729311, 13651.584722, 19916.440133,\n",
       "        26181.295544, 32446.150955, 38711.006366, 44975.861777,\n",
       "        51240.717188, 57505.572599, 63770.42801 ]),\n",
       " <a list of 10 Patch objects>)"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXoAAAD8CAYAAAB5Pm/hAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAAEC5JREFUeJzt3XuMXOV5x/Hvr5hLmpu5GGTZThcUqwqRGqAWBVFFKfQCJgr8ESRQ1VjUlaWGSolSKTWN1DZS/4BUKgi1IrFCWlPlAiFJQYQ0QQ6oqapATLiHUC/EDSsj7JRLmkapSvL0j3k3GZvFO+udZXdefT/SaN7znHfnPEc+/PZwZs5sqgpJUr9+abkbkCQtLYNekjpn0EtS5wx6SeqcQS9JnTPoJalzBr0kdc6gl6TOGfSS1LlVy90AwEknnVRTU1PL3YYkTZQHHnjgB1W1Zr55KyLop6am2L1793K3IUkTJcl/jjLPSzeS1DmDXpI6Z9BLUucMeknqnEEvSZ0z6CWpcwa9JHXOoJekzhn0ktS5FXFn7GJMbf/ysm177zUXL9u2JWlUntFLUucMeknqnEEvSZ0z6CWpcwa9JHXOoJekzhn0ktQ5g16SOmfQS1LnRgr6JHuTPJrkoSS7W+2EJHcn2dOej2/1JLkhyXSSR5KctZQ7IEk6vIWc0f9WVZ1RVZva8nZgV1VtBHa1ZYCLgI3tsQ24cVzNSpIWbjGXbi4BdrbxTuDSofrNNfBNYHWStYvYjiRpEUYN+gK+luSBJNta7ZSqehagPZ/c6uuAZ4Z+dqbVJEnLYNRvrzyvqvYlORm4O8l3DzM3c9TqFZMGvzC2AbzlLW8ZsQ1J0kKNdEZfVfva837gS8DZwHOzl2Ta8/42fQbYMPTj64F9c7zmjqraVFWb1qxZc+R7IEk6rHmDPsnrk7xxdgz8LvAYcAewpU3bAtzexncA72ufvjkHeGn2Eo8k6bU3yqWbU4AvJZmd/5mq+pck3wJuTbIV+D5wWZt/F7AZmAZ+DFw59q4lSSObN+ir6mngHXPU/wu4YI56AVeNpTtJ0qJ5Z6wkdc6gl6TOGfSS1DmDXpI6Z9BLUucMeknqnEEvSZ0z6CWpcwa9JHXOoJekzhn0ktQ5g16SOmfQS1LnDHpJ6pxBL0mdM+glqXMGvSR1zqCXpM4Z9JLUOYNekjpn0EtS5wx6SeqcQS9JnTPoJalzBr0kdc6gl6TOGfSS1DmDXpI6Z9BLUucMeknq3MhBn+SoJA8mubMtn5rkviR7ktyS5JhWP7YtT7f1U0vTuiRpFAs5o/8A8MTQ8rXAdVW1EXgB2NrqW4EXquqtwHVtniRpmYwU9EnWAxcDn2zLAc4HbmtTdgKXtvElbZm2/oI2X5K0DEY9o78e+DDws7Z8IvBiVb3clmeAdW28DngGoK1/qc0/SJJtSXYn2X3gwIEjbF+SNJ9V801I8m5gf1U9kORds+U5ptYI635RqNoB7ADYtGnTK9ZPgqntX16W7e695uJl2a6kyTRv0APnAe9Jshk4DngTgzP81UlWtbP29cC+Nn8G2ADMJFkFvBl4fuydS5JGMu+lm6q6uqrWV9UUcDnw9ar6feAe4L1t2hbg9ja+oy3T1n+9qibyjF2SerCYz9H/GfChJNMMrsHf1Oo3ASe2+oeA7YtrUZK0GKNcuvm5qroXuLeNnwbOnmPOT4DLxtCbJGkMvDNWkjpn0EtS5wx6SeqcQS9JnTPoJalzBr0kdc6gl6TOGfSS1DmDXpI6Z9BLUucMeknqnEEvSZ0z6CWpcwa9JHXOoJekzhn0ktQ5g16SOmfQS1LnDHpJ6pxBL0mdM+glqXMGvSR1zqCXpM4Z9JLUOYNekjpn0EtS5wx6SeqcQS9JnTPoJalz8wZ9kuOS3J/k4SSPJ/loq5+a5L4ke5LckuSYVj+2LU+39VNLuwuSpMMZ5Yz+f4Hzq+odwBnAhUnO
Aa4FrquqjcALwNY2fyvwQlW9FbiuzZMkLZN5g74GftQWj26PAs4Hbmv1ncClbXxJW6atvyBJxtaxJGlBRrpGn+SoJA8B+4G7gaeAF6vq5TZlBljXxuuAZwDa+peAE8fZtCRpdCMFfVX9tKrOANYDZwNvm2tae57r7L0OLSTZlmR3kt0HDhwYtV9J0gIt6FM3VfUicC9wDrA6yaq2aj2wr41ngA0Abf2bgefneK0dVbWpqjatWbPmyLqXJM1rlE/drEmyuo1fB/w28ARwD/DeNm0LcHsb39GWaeu/XlWvOKOXJL02Vs0/hbXAziRHMfjFcGtV3ZnkO8Dnkvw18CBwU5t/E/BPSaYZnMlfvgR9S5JGNG/QV9UjwJlz1J9mcL3+0PpPgMvG0p0kadG8M1aSOmfQS1LnDHpJ6pxBL0mdM+glqXMGvSR1zqCXpM4Z9JLUOYNekjpn0EtS5wx6SeqcQS9JnTPoJalzBr0kdc6gl6TOGfSS1DmDXpI6Z9BLUucMeknqnEEvSZ0z6CWpcwa9JHXOoJekzhn0ktQ5g16SOmfQS1LnDHpJ6pxBL0mdM+glqXMGvSR1bt6gT7IhyT1JnkjyeJIPtPoJSe5Osqc9H9/qSXJDkukkjyQ5a6l3QpL06kY5o38Z+NOqehtwDnBVktOB7cCuqtoI7GrLABcBG9tjG3Dj2LuWJI1s3qCvqmer6ttt/N/AE8A64BJgZ5u2E7i0jS8Bbq6BbwKrk6wde+eSpJEs6Bp9kingTOA+4JSqehYGvwyAk9u0dcAzQz8202qSpGUwctAneQPwBeCDVfXDw02do1ZzvN62JLuT7D5w4MCobUiSFmikoE9yNIOQ/3RVfbGVn5u9JNOe97f6DLBh6MfXA/sOfc2q2lFVm6pq05o1a460f0nSPEb51E2Am4Anqupvh1bdAWxp4y3A7UP197VP35wDvDR7iUeS9NpbNcKc84A/AB5N8lCr/TlwDXBrkq3A94HL2rq7gM3ANPBj4MqxdixJWpB5g76q/o25r7sDXDDH/AKuWmRfkqQx8c5YSeqcQS9JnTPoJalzBr0kdc6gl6TOGfSS1DmDXpI6Z9BLUucMeknqnEEvSZ0z6CWpcwa9JHXOoJekzhn0ktQ5g16SOmfQS1LnDHpJ6pxBL0mdM+glqXMGvSR1zqCXpM4Z9JLUuVXL3YAWbmr7l5dt23uvuXjZti3pyHhGL0mdM+glqXMGvSR1zqCXpM4Z9JLUOYNekjpn0EtS5+YN+iSfSrI/yWNDtROS3J1kT3s+vtWT5IYk00keSXLWUjYvSZrfKGf0/whceEhtO7CrqjYCu9oywEXAxvbYBtw4njYlSUdq3qCvqn8Fnj+kfAmws413ApcO1W+ugW8Cq5OsHVezkqSFO9Jr9KdU1bMA7fnkVl8HPDM0b6bVXiHJtiS7k+w+cODAEbYhSZrPuN+MzRy1mmtiVe2oqk1VtWnNmjVjbkOSNOtIg/652Usy7Xl/q88AG4bmrQf2HXl7kqTFOtKgvwPY0sZbgNuH6u9rn745B3hp9hKPJGl5zPs1xUk+C7wLOCnJDPCXwDXArUm2At8HLmvT7wI2A9PAj4Erl6BnSdICzBv0VXXFq6y6YI65BVy12KYkSePjHx7RgizXHz3xD55IR86vQJCkzhn0ktQ5g16SOmfQS1LnDHpJ6pxBL0mdM+glqXMGvSR1zqCXpM55Z6x0GMt1JzB4N7DGx6CXVii/bkLj4qUbSeqcQS9JnTPoJalzBr0kdc6gl6TOGfSS1DmDXpI65+foNRGW88YladIZ9JIO4t3A/fHSjSR1zqCXpM4Z9JLUOYNekjpn0EtS5wx6SeqcQS9JnTPoJalzSxL0SS5M8mSS6STbl2IbkqTRjD3okxwF/D1wEXA6cEWS08e9HUnSaJbiKxDOBqar6mmAJJ8DLgG+swTbkqRF6/1rH5Yi6NcBzwwtzwC/sQTbkdQZv7xuaSxF0GeOWr1iUrIN2NYWf5TkyRFe
+yTgB4vobblNev/gPqwEk94/TP4+jK3/XLuoH/+VUSYtRdDPABuGltcD+w6dVFU7gB0LeeEku6tq0+LaWz6T3j+4DyvBpPcPk78Pk9b/Unzq5lvAxiSnJjkGuBy4Ywm2I0kawdjP6Kvq5SR/AnwVOAr4VFU9Pu7tSJJGsyR/eKSq7gLuWoKXXtClnhVo0vsH92ElmPT+YfL3YaL6T9Ur3ieVJHXEr0CQpM5NRNCvtK9USPKpJPuTPDZUOyHJ3Un2tOfjWz1Jbmi9P5LkrKGf2dLm70myZaj+60kebT9zQ5K5PrK6mP43JLknyRNJHk/ygQnch+OS3J/k4bYPH231U5Pc1/q5pX0ggCTHtuXptn5q6LWubvUnk/zeUH3Jj7skRyV5MMmdE9r/3vbv/FCS3a02ScfR6iS3Jflu++/h3Enqf2RVtaIfDN7QfQo4DTgGeBg4fZl7eidwFvDYUO1jwPY23g5c28abga8wuL/gHOC+Vj8BeLo9H9/Gx7d19wPntp/5CnDRmPtfC5zVxm8E/oPB11VM0j4EeEMbHw3c13q7Fbi81T8O/HEbvx/4eBtfDtzSxqe3Y+pY4NR2rB31Wh13wIeAzwB3tuVJ638vcNIhtUk6jnYCf9TGxwCrJ6n/kfdzOTa6wH+Ic4GvDi1fDVy9Avqa4uCgfxJY28ZrgSfb+BPAFYfOA64APjFU/0SrrQW+O1Q/aN4S7cvtwO9M6j4Avwx8m8Ed2D8AVh167DD4FNi5bbyqzcuhx9PsvNfiuGNwj8ku4HzgztbPxPTfXncvrwz6iTiOgDcB36O9Vzlp/S/kMQmXbub6SoV1y9TL4ZxSVc8CtOeTW/3V+j9cfWaO+pJolwDOZHBGPFH70C57PATsB+5mcAb7YlW9PMd2f95rW/8ScOI8+7DUx931wIeBn7XlEyesfxjc9f61JA9kcLc7TM5xdBpwAPiHdvnsk0leP0H9j2wSgn6kr1RYwV6t/4XWxy7JG4AvAB+sqh8ebuqr9LSs+1BVP62qMxicGZ8NvO0w211R+5Dk3cD+qnpguHyYba6o/oecV1VnMfi22quSvPMwc1faPqxicAn2xqo6E/gfBpdqXs1K639kkxD0I32lwgrwXJK1AO15f6u/Wv+Hq6+foz5WSY5mEPKfrqovTuI+zKqqF4F7GVw3XZ1k9v6Q4e3+vNe2/s3A8yx838blPOA9SfYCn2Nw+eb6CeofgKra1573A19i8At3Uo6jGWCmqu5ry7cxCP5J6X90y3G9aIHX0VYxeHPjVH7xptLbV0BfUxx8jf5vOPgNnI+18cUc/AbO/a1+AoPrg8e3x/eAE9q6b7W5s2/gbB5z7wFuBq4/pD5J+7AGWN3GrwO+Abwb+DwHv5n5/ja+ioPfzLy1jd/OwW9mPs3gjczX7LgD3sUv3oydmP6B1wNvHBr/O3DhhB1H3wB+tY3/qvU+Mf2PvJ/LsdEj+MfYzOCTIU8BH1kB/XwWeBb4Pwa/tbcyuF66C9jTnmf/ocPgD7E8BTwKbBp6nT8EptvjyqH6JuCx9jN/xyFvFo2h/99k8L+QjwAPtcfmCduHXwMebPvwGPAXrX4ag086TDMIzWNb/bi2PN3Wnzb0Wh9pfT7J0KciXqvjjoODfmL6b70+3B6Pz25jwo6jM4Dd7Tj6ZwZBPTH9j/rwzlhJ6twkXKOXJC2CQS9JnTPoJalzBr0kdc6gl6TOGfSS1DmDXpI6Z9BLUuf+H0vF5h2vKbLBAAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<matplotlib.figure.Figure at 0x26a4d355b38>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Histogram of the raw charges column with matplotlib's default binning.\n",
    "plt.hist(x=data['charges'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(array([ 19.,  27.,  52.,  53.,  47.,  54.,  71.,  84.,  90., 108., 126.,\n",
       "        157.,  98.,  58.,  67.,  56.,  35.,  80.,  50.,   6.]),\n",
       " array([ 7.02275569,  7.22477015,  7.42678461,  7.62879907,  7.83081352,\n",
       "         8.03282798,  8.23484244,  8.4368569 ,  8.63887136,  8.84088581,\n",
       "         9.04290027,  9.24491473,  9.44692919,  9.64894365,  9.8509581 ,\n",
       "        10.05297256, 10.25498702, 10.45700148, 10.65901594, 10.86103039,\n",
       "        11.06304485]),\n",
       " <a list of 20 Patch objects>)"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXoAAAD8CAYAAAB5Pm/hAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAAEqRJREFUeJzt3X+s3Xd93/Hna0kTCNMaB9/QYMd1qAwlICqy2ygFDWW4g0CqOF1BcvoDQ1NZtGlH203FDGnRJlVLumow2o3KJSlmQoEso01aQiFNodmkJtRJCbFj0pgkTYxNfCEQxpgChvf+OF9Xp7fHvvee7zn3XH/yfEhX53w/38/3fN/63q9f/t7P+f5IVSFJatc/mnUBkqTpMuglqXEGvSQ1zqCXpMYZ9JLUOINekhpn0EtS4wx6SWqcQS9JjTt91gUArF+/vjZv3jzrMiTplHLvvfd+parmluq3JoJ+8+bN7N27d9ZlSNIpJcnfLqefQzeS1Lglgz7JjUmOJtm3qP1XkjyUZH+S3xpqf1eSg92810+jaEnS8i1n6OaDwO8CHzrekOSfA9uAV1TVM0nO7dovBLYDLwNeCPxZkhdX1XcnXbgkaXmWPKKvqruApxY1/yJwXVU90/U52rVvAz5SVc9U1aPAQeDiCdYrSVqhccfoXwz8syT3JPmLJD/atW8Anhjqd6hrkyTNyLhn3ZwOrAMuAX4UuDnJi4CM6DvyySZJdgI7ATZt2jRmGZKkpYx7RH8I+FgNfBb4HrC+az9/qN9G4PCoD6iq3VU1X1Xzc3NLngYqSRrTuEH/R8BrAZK8GDgD+ApwG7A9yZlJLgC2AJ+dRKGSpPEsOXST5CbgUmB9kkPAtcCNwI3dKZffBnbU4OGz+5PcDDwIHAOu8YwbSZqtrIWHg8/Pz5dXxqpFm3d9fOxlH7vu8glWohYlubeq5pfq55WxktQ4g16SGmfQS1LjDHpJapxBL0mNM+glqXEGvSQ1zqCXpMYZ9JLUOINekhpn0EtS4wx6SWqcQS9JjTPoJalxBr0kNc6gl6TGGfSS1Lglgz7JjUmOdo8NXDzv3ySpJOu76SR5X5KDST6f5KJpFC1JWr7lHNF/ELhscWOS84F/ATw+1PwGBg8E3wLsBN7fv0RJUh9LBn1V3QU8NWLWe4DfAIYfOrsN+FAN3A2cneS8iVQqSRrLWGP0Sa4AvlRV9y+atQF4Ymj6UNcmSZqR01e6QJKzgHcDrxs1e0RbjWgjyU4Gwzts2rRppWVIkpZpnCP6HwIuAO5P8hiwEbgvyQ8wOII/f6jvRuDwqA+pqt1VNV9V83Nzc2OUIUlajhUHfVU9UFXnVtXmqtrMINwvqqovA7cBb+nOvrkEeLqqjky2ZEnSSizn9MqbgL8EXpLkUJKrT9L9duAR4CDw+8AvTaRKSdLYlhyjr6qrlpi/eeh9Adf0L0uSNCleGStJjTPoJalxBr0kNc6gl6TGGfSS1DiDXpIaZ9BLUuMMeklqnEEvSY0z6CWpcQa9JDXOoJekxhn0ktQ4g16SGrfiRwlKzzabd3181iVIvXhEL0mNM+glqXEGvSQ1bjnPjL0xydEk+4ba/lOSLyT5fJI/THL20Lx3JTmY5KEkr59W4ZKk5VnOEf0HgcsWtd0BvLyqXgH8DfAugCQXAtuBl3XL/Lckp02sWknSii0Z9FV1F/DUorZPVdWxbvJuYGP3fhvwkap6pqoeBQ4CF0+wXknSCk1ijP7ngU907zcATwzNO9S1SZJmpFfQJ3k3cAz48PGmEd3qBMvuTLI3yd6FhYU+ZUiSTmLsoE+yA/gJ4Geq6niYHwLOH+q2ETg8avmq2l1V81U1Pzc3N24ZkqQljBX0SS4D3glcUVXfGpp1G7A9yZlJLgC2AJ/tX6YkaVxL3gIhyU3ApcD6JIeAaxmcZXMmcEcSgLur6u1VtT/JzcCDDIZ0rqmq706reEnS0pYM
+qq6akTzDSfp/5vAb/YpSpI0OV4ZK0mNM+glqXEGvSQ1zqCXpMYZ9JLUOINekhpn0EtS4wx6SWqcQS9JjTPoJalxS94CQTrVbd718VmXIM2UR/SS1DiDXpIaZ9BLUuMMeklqnEEvSY0z6CWpcUsGfZIbkxxNsm+o7ZwkdyR5uHtd17UnyfuSHEzy+SQXTbN4SdLSlnNE/0HgskVtu4A7q2oLcGc3DfAGBg8E3wLsBN4/mTIlSeNaMuir6i7gqUXN24A93fs9wJVD7R+qgbuBs5OcN6liJUkrN+4Y/Quq6ghA93pu174BeGKo36GuTZI0I5P+MjYj2mpkx2Rnkr1J9i4sLEy4DEnSceMG/ZPHh2S616Nd+yHg/KF+G4HDoz6gqnZX1XxVzc/NzY1ZhiRpKeMG/W3Aju79DuDWofa3dGffXAI8fXyIR5I0G0vevTLJTcClwPokh4BrgeuAm5NcDTwOvLnrfjvwRuAg8C3gbVOoWXpW6HPXzceuu3yClehUt2TQV9VVJ5i1dUTfAq7pW5QkaXK8MlaSGmfQS1LjDHpJapxBL0mNM+glqXEGvSQ1zqCXpMYZ9JLUuCUvmJLWgj5XiUrPdh7RS1LjDHpJapxBL0mNM+glqXEGvSQ1zqCXpMYZ9JLUOM+j16rxXHhpNjyil6TG9Qr6JL+WZH+SfUluSvKcJBckuSfJw0k+muSMSRUrSVq5sYM+yQbgXwHzVfVy4DRgO3A98J6q2gJ8Dbh6EoVKksbTd+jmdOC5SU4HzgKOAK8Fbunm7wGu7LkOSVIPY38ZW1VfSvLbwOPA/wM+BdwLfL2qjnXdDgEbRi2fZCewE2DTpk3jliFJQL8v+x+77vIJVrL29Bm6WQdsAy4AXgg8D3jDiK41avmq2l1V81U1Pzc3N24ZkqQl9Bm6+XHg0apaqKrvAB8DXgWc3Q3lAGwEDvesUZLUQ5+gfxy4JMlZSQJsBR4EPg28qeuzA7i1X4mSpD7GDvqquofBl673AQ90n7UbeCfw60kOAs8HbphAnZKkMfW6MraqrgWuXdT8CHBxn8+VJE2OV8ZKUuMMeklqnEEvSY0z6CWpcQa9JDXOoJekxvngEa2IDw+RTj0e0UtS4wx6SWqcQS9JjTPoJalxfhkr6e/p+4V76w/xOBV5RC9JjTPoJalxBr0kNc6gl6TGGfSS1LheQZ/k7CS3JPlCkgNJfizJOUnuSPJw97puUsVKklau7xH9fwH+tKp+GPgR4ACwC7izqrYAd3bTkqQZGTvok/wT4DV0D/+uqm9X1deBbcCertse4Mq+RUqSxtfniP5FwALwB0n+OskHkjwPeEFVHQHoXs+dQJ2SpDH1CfrTgYuA91fVK4H/ywqGaZLsTLI3yd6FhYUeZUiSTqZP0B8CDlXVPd30LQyC/8kk5wF0r0dHLVxVu6tqvqrm5+bmepQhSTqZse91U1VfTvJEkpdU1UPAVuDB7mcHcF33eutEKpW0bD4gRsP63tTsV4APJzkDeAR4G4O/Em5OcjXwOPDmnuuQJPXQK+ir6nPA/IhZW/t8riRpcrxNsaSJ6jNs5C2Op8NbIEhS4wx6SWqcQzfPQp6RIT27GPQz4jimpNVi0PfgkbGkU4Fj9JLUOINekhrn0I2kZ72+w7Br/Xszj+glqXEGvSQ1zqCXpMY5Rn8K8rROSSvhEb0kNc6gl6TGGfSS1DiDXpIa1zvok5yW5K+T/Ek3fUGSe5I8nOSj3WMGJUkzMokj+ncAB4amrwfeU1VbgK8BV09gHZKkMfUK+iQbgcuBD3TTAV4L3NJ12QNc2WcdkqR++h7Rvxf4DeB73fTzga9X1bFu+hCwoec6JEk9jB30SX4COFpV9w43j+haJ1h+Z5K9SfYuLCyMW4YkaQl9juhfDVyR5DHgIwyGbN4LnJ3k+BW3G4HDoxauqt1VNV9V83Nzcz3KkCSdzNhBX1XvqqqNVbUZ2A78eVX9DPBp4E1dtx3Arb2rlCSNbRrn0b8T+PUk
BxmM2d8whXVIkpZpIjc1q6rPAJ/p3j8CXDyJz5Uk9eeVsZLUOINekhrn/eglrRk+a2E6PKKXpMYZ9JLUOINekhpn0EtS4wx6SWqcQS9JjTPoJalxBr0kNc6gl6TGPeuvjPVKPEmt84hekhpn0EtS4wx6SWqcQS9JjTPoJalxYwd9kvOTfDrJgST7k7yjaz8nyR1JHu5e102uXEnSSvU5oj8G/OuqeilwCXBNkguBXcCdVbUFuLObliTNyNhBX1VHquq+7v3/AQ4AG4BtwJ6u2x7gyr5FSpLGN5Ex+iSbgVcC9wAvqKojMPjPADj3BMvsTLI3yd6FhYVJlCFJGqF30Cf5x8D/BH61qr6x3OWqandVzVfV/NzcXN8yJEkn0OsWCEm+j0HIf7iqPtY1P5nkvKo6kuQ84GjfIk/GWxhI0sn1OesmwA3Agar6z0OzbgN2dO93ALeOX54kqa8+R/SvBn4OeCDJ57q2fwtcB9yc5GrgceDN/UqUJPUxdtBX1f8GcoLZW8f9XEnSZHllrCQ1zqCXpMYZ9JLUOINekhpn0EtS4wx6SWqcQS9JjTPoJalxBr0kNc6gl6TGGfSS1DiDXpIaZ9BLUuN6PXhEktTvAUiPXXf5BCsZzSN6SWqcQS9JjTPoJalxUwv6JJcleSjJwSS7prUeSdLJTSXok5wG/FfgDcCFwFVJLpzGuiRJJzetI/qLgYNV9UhVfRv4CLBtSuuSJJ3EtIJ+A/DE0PShrk2StMqmdR59RrTV3+uQ7AR2dpPfTPLQGOtZD3xljOWmaS3WBNa1EmuxJrCulViLNcGIunJ9r8/7weV0mlbQHwLOH5reCBwe7lBVu4HdfVaSZG9Vzff5jElbizWBda3EWqwJrGsl1mJNMLu6pjV081fAliQXJDkD2A7cNqV1SZJOYipH9FV1LMkvA58ETgNurKr901iXJOnkpnavm6q6Hbh9Wp/f6TX0MyVrsSawrpVYizWBda3EWqwJZlRXqmrpXpKkU5a3QJCkxq35oE/ykiSfG/r5RpJfXdQnSd7X3W7h80kuWgM1XZrk6aE+/26aNQ2t99eS7E+yL8lNSZ6zaP6ZST7abat7kmxeAzW9NcnC0Lb6hWnX1K33HV1N+xf//rr5q7pfraCuVdm3ktyY5GiSfUNt5yS5I8nD3eu6Eyy7o+vzcJIda6Sm7w5ts4meHHKCut7c/Q6/l+SEZ9qsyu1iquqU+WHwxe6XgR9c1P5G4BMMzt+/BLhnDdR0KfAnq7x9NgCPAs/tpm8G3rqozy8Bv9e93w58dA3U9Fbgd1d5W70c2AecxeC7qj8Dtsx6v1pmXauybwGvAS4C9g21/Rawq3u/C7h+xHLnAI90r+u69+tmWVM375urvK1eCrwE+Awwf4LlTgO+CLwIOAO4H7hw0vWt+SP6RbYCX6yqv13Uvg34UA3cDZyd5LwZ1zQrpwPPTXI6g7A4vGj+NmBP9/4WYGuSURe4rWZNs/BS4O6q+lZVHQP+AvjJRX1msV8tp65VUVV3AU8tah7ef/YAV45Y9PXAHVX1VFV9DbgDuGzGNU3VqLqq6kBVLXUh6KrcLuZUC/rtwE0j2md5y4UT1QTwY0nuT/KJJC+bdiFV9SXgt4HHgSPA01X1qUXd/m5bdUHyNPD8GdcE8FPd8MgtSc4fMX/S9gGvSfL8JGcxOHpfvN5Z7FfLqQtWed8a8oKqOgLQvZ47os9qb7fl1ATwnCR7k9ydZNX/MziBVdlWp0zQdxdeXQH8j1GzR7RN/XSiJWq6j8Fwzo8AvwP80SrUs47B0cAFwAuB5yX52cXdRiw6tW21zJr+GNhcVa9gMFSxhymrqgPA9QyONv+UwZ/MxxZ1W/X9apl1rfq+tUIz+fe4DJtqcFXqTwPvTfJDsy6IVdpWp0zQM7jl8X1V9eSIeUvecmG1a6qqb1TVN7v3twPfl2T9lOv5ceDRqlqoqu8AHwNetajP322rbijl+/mHfwqvak1V9dWqeqab/H3g
n06xnuH13lBVF1XVaxhsg4cXdZnJfrVUXTPat4578vjwVfd6dESf1d5uy6mJqjrcvT7CYNz8lVOsablWZVudSkF/FSceIrkNeEt3lsQlDIYHjsyypiQ/cHzsO8nFDLb1V6dcz+PAJUnO6ta9FTiwqM9twPGzIN4E/Hl13wrNqqZF495XLJ4/LUnO7V43Af+Sf/i7nMl+tVRdM9q3jhvef3YAt47o80ngdUnWdX/Rva5rm1lNXS1ndu/XA68GHpxiTcu1OreLmda30JP8YfAF3leB7x9qezvw9u59GDzo5IvAA5zgG+5VrumXgf0M/vS+G3jVKm2rfw98gcFY738HzgT+A3BFN/85DIaaDgKfBV60Bmr6j0Pb6tPAD6/StvpfDP6x3w9sXQv71TLrWpV9i8F/MEeA7zA48ryawfc5dzL4K+NO4Jyu7zzwgaFlf77bxw4Cb5t1TQz+inyg22YPAFevwrb6ye79M8CTwCe7vi8Ebh9a9o3A33T72bun8bv0ylhJatypNHQjSRqDQS9JjTPoJalxBr0kNc6gl6TGGfSS1DiDXpIaZ9BLUuP+P0jy6agiOwP5AAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<matplotlib.figure.Figure at 0x26a4d4f2128>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Histogram of the natural log of charges, using 20 bins.\n",
    "plt.hist(x=np.log(data['charges']), bins=20)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 特征工程"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>age</th>\n",
       "      <th>bmi</th>\n",
       "      <th>children</th>\n",
       "      <th>charges</th>\n",
       "      <th>sex_female</th>\n",
       "      <th>sex_male</th>\n",
       "      <th>smoker_no</th>\n",
       "      <th>smoker_yes</th>\n",
       "      <th>region_northeast</th>\n",
       "      <th>region_northwest</th>\n",
       "      <th>region_southeast</th>\n",
       "      <th>region_southwest</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>19</td>\n",
       "      <td>27.900</td>\n",
       "      <td>0</td>\n",
       "      <td>16884.92400</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>18</td>\n",
       "      <td>33.770</td>\n",
       "      <td>1</td>\n",
       "      <td>1725.55230</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>28</td>\n",
       "      <td>33.000</td>\n",
       "      <td>3</td>\n",
       "      <td>4449.46200</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>33</td>\n",
       "      <td>22.705</td>\n",
       "      <td>0</td>\n",
       "      <td>21984.47061</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>32</td>\n",
       "      <td>28.880</td>\n",
       "      <td>0</td>\n",
       "      <td>3866.85520</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   age     bmi  children      charges  sex_female  sex_male  smoker_no  \\\n",
       "0   19  27.900         0  16884.92400           1         0          0   \n",
       "1   18  33.770         1   1725.55230           0         1          1   \n",
       "2   28  33.000         3   4449.46200           0         1          1   \n",
       "3   33  22.705         0  21984.47061           0         1          1   \n",
       "4   32  28.880         0   3866.85520           0         1          1   \n",
       "\n",
       "   smoker_yes  region_northeast  region_northwest  region_southeast  \\\n",
       "0           1                 0                 0                 0   \n",
       "1           0                 0                 0                 1   \n",
       "2           0                 0                 0                 1   \n",
       "3           0                 0                 1                 0   \n",
       "4           0                 0                 1                 0   \n",
       "\n",
       "   region_southwest  \n",
       "0                 1  \n",
       "1                 0  \n",
       "2                 0  \n",
       "3                 0  \n",
       "4                 0  "
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# One-hot encode the text columns (sex, smoker, region); numeric columns pass through unchanged.\n",
    "data = pd.get_dummies(data=data)\n",
    "data.head(n=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>age</th>\n",
       "      <th>bmi</th>\n",
       "      <th>children</th>\n",
       "      <th>sex_female</th>\n",
       "      <th>sex_male</th>\n",
       "      <th>smoker_no</th>\n",
       "      <th>smoker_yes</th>\n",
       "      <th>region_northeast</th>\n",
       "      <th>region_northwest</th>\n",
       "      <th>region_southeast</th>\n",
       "      <th>region_southwest</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>19</td>\n",
       "      <td>27.900</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>18</td>\n",
       "      <td>33.770</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>28</td>\n",
       "      <td>33.000</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>33</td>\n",
       "      <td>22.705</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>32</td>\n",
       "      <td>28.880</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   age     bmi  children  sex_female  sex_male  smoker_no  smoker_yes  \\\n",
       "0   19  27.900         0           1         0          0           1   \n",
       "1   18  33.770         1           0         1          1           0   \n",
       "2   28  33.000         3           0         1          1           0   \n",
       "3   33  22.705         0           0         1          1           0   \n",
       "4   32  28.880         0           0         1          1           0   \n",
       "\n",
       "   region_northeast  region_northwest  region_southeast  region_southwest  \n",
       "0                 0                 0                 0                 1  \n",
       "1                 0                 0                 1                 0  \n",
       "2                 0                 0                 1                 0  \n",
       "3                 0                 1                 0                 0  \n",
       "4                 0                 1                 0                 0  "
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Feature matrix: every column except the charges target.\n",
    "x = data.drop(columns='charges')\n",
    "x.head(n=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Regression target: the charges column, still on its original scale.\n",
    "y = data['charges']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Replace missing values with 0 by rebinding the names instead of using inplace=True:\n",
    "# y is a column taken from data, so an in-place fill could also modify data\n",
    "# (or emit a warning), depending on the pandas version.\n",
    "x = x.fillna(0)\n",
    "y = y.fillna(0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "# Hold out 30% of the rows for testing. The fixed random_state makes the split,\n",
    "# and therefore every metric computed from it, reproducible across kernel restarts.\n",
    "x_train, x_test, y_train, y_test = train_test_split(\n",
    "    x, y, test_size=0.3, random_state=42\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\ProgramData\\Anaconda3\\lib\\site-packages\\sklearn\\preprocessing\\data.py:617: DataConversionWarning: Data with input dtype uint8, int64, float64 were all converted to float64 by StandardScaler.\n",
      "  return self.partial_fit(X, y)\n"
     ]
    }
   ],
   "source": [
    "from sklearn.preprocessing import StandardScaler\n",
    "\n",
    "# Learn the per-feature mean and standard deviation from the training rows only\n",
    "# (centering and unit-variance scaling are StandardScaler's defaults).\n",
    "scaler = StandardScaler().fit(X=x_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\ProgramData\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:1: DataConversionWarning: Data with input dtype uint8, int64, float64 were all converted to float64 by StandardScaler.\n",
      "  \"\"\"Entry point for launching an IPython kernel.\n",
      "C:\\ProgramData\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:2: DataConversionWarning: Data with input dtype uint8, int64, float64 were all converted to float64 by StandardScaler.\n",
      "  \n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "array([[ 0.08653871,  0.20869392, -0.13377462, ..., -0.5543127 ,\n",
       "        -0.62678317,  1.78810218],\n",
       "       [-1.34329769,  0.74643788, -0.95754468, ..., -0.5543127 ,\n",
       "         1.59544807, -0.55925216],\n",
       "       [-1.41478951,  0.08919527, -0.95754468, ..., -0.5543127 ,\n",
       "         1.59544807, -0.55925216],\n",
       "       ...,\n",
       "       [ 0.30101417, -0.18936579,  0.68999543, ..., -0.5543127 ,\n",
       "        -0.62678317, -0.55925216],\n",
       "       [ 0.08653871,  0.20869392,  0.68999543, ..., -0.5543127 ,\n",
       "        -0.62678317,  1.78810218],\n",
       "       [ 1.37339147,  0.94587145, -0.13377462, ..., -0.5543127 ,\n",
       "        -0.62678317, -0.55925216]])"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Apply the scaling fitted on the training set to both splits, then show the training matrix.\n",
    "x_train_scaled, x_test_scaled = scaler.transform(x_train), scaler.transform(x_test)\n",
    "x_train_scaled"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import PolynomialFeatures\n",
    "\n",
    "# Expand the scaled features with all degree-2 terms (no constant column).\n",
    "# The inputs are rebuilt from scaler.transform(...) so that re-running this cell\n",
    "# does not apply the expansion to already-expanded data, and the test split goes\n",
    "# through transform() so the expansion is fitted on the training data only.\n",
    "poly_features = PolynomialFeatures(degree=2, include_bias=False)\n",
    "x_train_scaled = poly_features.fit_transform(scaler.transform(x_train))\n",
    "x_test_scaled = poly_features.transform(scaler.transform(x_test))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 模型训练"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.ensemble import GradientBoostingRegressor\n",
    "from sklearn.linear_model import LinearRegression, Ridge\n",
    "\n",
    "# Ordinary least squares on the expanded features. The target is fitted as log1p(charges),\n",
    "# so y_predict is on that log scale as well.\n",
    "reg = LinearRegression().fit(x_train_scaled, np.log1p(y_train))\n",
    "y_predict = reg.predict(x_test_scaled)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Ridge regression (alpha=0.4) on the same features and log1p target;\n",
    "# y_predict_ridge is therefore on the log scale.\n",
    "ridge = Ridge(alpha=0.4).fit(x_train_scaled, np.log1p(y_train))\n",
    "y_predict_ridge = ridge.predict(x_test_scaled)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Gradient-boosted trees on the same features and log1p target.\n",
    "# random_state pins the estimator's internal randomness so reruns give the same model.\n",
    "booster = GradientBoostingRegressor(random_state=42)\n",
    "\n",
    "booster.fit(x_train_scaled, np.log1p(y_train))\n",
    "# The test predictions must come from booster itself, not from the ridge model.\n",
    "y_predict_boost = booster.predict(x_test_scaled)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 模型评估"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(0.3536371489525544, 0.39371753546305405, 5070.812975583753, 5125.170372384354)"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.metrics import mean_squared_error\n",
    "\n",
    "# RMSE of the linear model on the log1p scale (train, then test).\n",
    "log_rmse_train = np.sqrt(mean_squared_error(y_true=np.log1p(y_train), y_pred=reg.predict(x_train_scaled)))\n",
    "log_rmse_test = np.sqrt(mean_squared_error(y_true=np.log1p(y_test), y_pred=y_predict))\n",
    "# RMSE on the original charges scale. expm1 is the exact inverse of the log1p applied\n",
    "# to the training target; plain exp would leave every prediction too large by 1.\n",
    "rmse_train = np.sqrt(mean_squared_error(y_true=y_train, y_pred=np.expm1(reg.predict(x_train_scaled))))\n",
    "rmse_test = np.sqrt(mean_squared_error(y_true=y_test, y_pred=np.expm1(reg.predict(x_test_scaled))))\n",
    "\n",
    "log_rmse_train, log_rmse_test, rmse_train, rmse_test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(0.3534913754377518, 0.39360130327172227, 5071.387838487504, 5136.811167129137)"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# RMSE of the ridge model on the log1p scale (train, then test).\n",
    "log_rmse_train = np.sqrt(mean_squared_error(y_true=np.log1p(y_train), y_pred=ridge.predict(x_train_scaled)))\n",
    "log_rmse_test = np.sqrt(mean_squared_error(y_true=np.log1p(y_test), y_pred=y_predict_ridge))\n",
    "# RMSE on the original charges scale. expm1 is the exact inverse of the log1p applied\n",
    "# to the training target; plain exp would leave every prediction too large by 1.\n",
    "rmse_train = np.sqrt(mean_squared_error(y_true=y_train, y_pred=np.expm1(ridge.predict(x_train_scaled))))\n",
    "rmse_test = np.sqrt(mean_squared_error(y_true=y_test, y_pred=np.expm1(ridge.predict(x_test_scaled))))\n",
    "\n",
    "log_rmse_train, log_rmse_test, rmse_train, rmse_test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(0.2638880698167762,\n",
       " 0.40108622040704345,\n",
       " 3738.4082462553074,\n",
       " 4586.566830607261)"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# RMSE of the gradient-boosting model on the log1p scale (train, then test).\n",
    "# The test score is computed from booster.predict directly so that it always\n",
    "# reflects the boosted model rather than a prediction array built elsewhere.\n",
    "log_rmse_train = np.sqrt(mean_squared_error(y_true=np.log1p(y_train), y_pred=booster.predict(x_train_scaled)))\n",
    "log_rmse_test = np.sqrt(mean_squared_error(y_true=np.log1p(y_test), y_pred=booster.predict(x_test_scaled)))\n",
    "# RMSE on the original charges scale. expm1 is the exact inverse of the log1p applied\n",
    "# to the training target; plain exp would leave every prediction too large by 1.\n",
    "rmse_train = np.sqrt(mean_squared_error(y_true=y_train, y_pred=np.expm1(booster.predict(x_train_scaled))))\n",
    "rmse_test = np.sqrt(mean_squared_error(y_true=y_test, y_pred=np.expm1(booster.predict(x_test_scaled))))\n",
    "\n",
    "log_rmse_train, log_rmse_test, rmse_train, rmse_test"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
